repurposing drugs on a hetnet

by Siyue Wu
Mentors: Andrew Su, Benjamin Good
August 5th, 2016

Background

Main Algorithm

# Report the shape of the full feature matrix as (rows, columns).
c(nrow(tran_df), ncol(tran_df))
## [1] 3775 2070
3775 observations and 2070 features
Graph: the pattern of zero values in the matrix
# ---- Correlation between the degree feature columns ----
# Draw on a single plotting panel.
par(mfrow = c(1, 1))

# Keep only the columns whose names start with "degree".
p0 <- data.frame(
  select(data.frame(X_dwpc), starts_with("degree"))
)

# Pairwise correlations; "complete" is matched by cor() as an abbreviation
# of "complete.obs".
c0 <- cor(p0, use = "complete")

# Columns flagged by findCorrelation() at a cutoff of 0.80.
rem0 <- findCorrelation(c0, cutoff = 0.80)

# Correlation matrix restricted to the flagged columns, drawn as ellipses in
# the lower triangle with black text labels.
p0.rem <- cor(p0[, rem0], use = "complete")
p0.plot <- corrplot(
  p0.rem,
  method = "ellipse",
  type = "lower",
  order = "FPC",
  tl.col = "black"
)
Graph: correlation plot of the highly correlated (correlation > 0.80) degree feature (second method) columns
# ---- Correlation between the DWPC feature columns ----
# Keep only the columns whose names start with "dwpc".
p1 <- data.frame(select(data.frame(X_dwpc), starts_with("dwpc")))
c1 <- cor(p1, use = "complete.obs")

# Columns flagged by findCorrelation() at a cutoff of 0.95.
rem1 <- findCorrelation(c1, cutoff = 0.95)

# Too many columns are flagged to show in a single figure, so they are
# plotted in consecutive, non-overlapping chunks of j columns each, i.e.
# ceiling(length(rem1) / j) chunks in total.
j <- 30
for (i in seq_len(ceiling(length(rem1) / j))) {
  # Chunk i covers positions (i - 1) * j + 1 .. min(i * j, length(rem1)).
  # Starting at (i - 1) * j instead would index position 0 for the first
  # chunk (silently dropped by R) and repeat the previous chunk's last
  # column in every later chunk.
  number <- min(j * i, length(rem1))
  chunk <- rem1[((i - 1) * j + 1):number]
  if (length(chunk) < 2) {
    # A single leftover column has nothing to be correlated with.
    next
  }
  # drop = FALSE keeps a data frame even for a narrow selection.
  p1.rem <- cor(p1[, chunk, drop = FALSE], use = "complete.obs")
  p1.plot <- corrplot(
    p1.rem,
    method = "ellipse",
    type = "lower",
    order = "alphabet",
    tl.cex = 0.6
  )
}
All 11 graphs: correlation plots of the highly correlated (correlation > 0.95) DWPC feature columns
## 
## ----------------------------------------------------
##  Lasso.logistic   Random.Forest   Gredient.Boosting 
## ---------------- --------------- -------------------
##      0.3321          0.3264            0.2795       
## ----------------------------------------------------
# ROC curves of the three classifiers, overlaid on one set of axes.
plot(glmnetPlotRoc, col = "red", lty = 1, lwd = 2, main = "ROC curves")
plot(randomForestPlotRoc, col = "blue", lty = 1, lwd = 2, add = TRUE)
plot(xgboostPlotRoc, col = "green", lty = 1, lwd = 2, add = TRUE)

# Colour key for the three curves, drawn without a box.
legend(
  0.56, 0.2,
  legend = c("logisticRegression", "randomforest", "gradientBoosting"),
  lty = 1,
  lwd = 2,
  col = c("red", "blue", "green"),
  cex = 1,
  bty = "n"
)

# Second legend listing each model's AUROC rounded to two decimals; the
# trailing "\n" entry is passed through as an extra legend item.
legend(
  0.3, 0.6,
  c(
    paste0(
      c(
        "AUROC for logistic = ",
        "AUROC for randomforest = ",
        "AUROC for gradientboosting = "
      ),
      c(
        round(glmnetAuroc, digits = 2),
        round(randomForestAuroc, digits = 2),
        round(xgboostAuroc, digits = 2)
      )
    ),
    "\n"
  ),
  border = "white",
  cex = 0.8,
  box.col = "white"
)

# Precision-recall curves of the three classifiers, overlaid on one set of
# axes.
plot(glmnetPlotPrc, col = "red", lty = 1, lwd = 2, main = "PRC curves")
plot(randomForestPlotPrc, col = "blue", lty = 1, lwd = 2, add = TRUE)
plot(xgboostPlotPrc, col = "green", lty = 1, lwd = 2, add = TRUE)

# Colour key for the three curves, drawn without a box.
legend(
  0.1, 0.2,
  legend = c("logisticRegression", "randomforest", "gradientBoosting"),
  lty = 1,
  lwd = 2,
  col = c("red", "blue", "green"),
  cex = 1,
  bty = "n"
)

# Second legend listing each model's AUPRC rounded to two decimals; the
# trailing "\n" entry is passed through as an extra legend item.
# The random-forest label previously read "AURPC"; it now matches the others.
legend(
  0.1, 0.45,
  c(
    paste0(
      c(
        "AUPRC for logistic = ",
        "AUPRC for randomforest = ",
        "AUPRC for gradientboosting = "
      ),
      c(
        round(glmnetAuprc, digits = 2),
        round(randomForestAuprc, digits = 2),
        round(xgboostAuprc, digits = 2)
      )
    ),
    "\n"
  ),
  border = "white",
  cex = 0.8,
  box.col = "white"
)

Thank you!